{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "bfd21ebc-cb32-4307-8f87-9a59a91888bf",
   "metadata": {},
   "source": [
    "# Connect to Elasticsearch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "d4bdf858-7db7-4054-b8c2-b3da652a9ed2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'name': 'liuxgm.local', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'h2QwONxsT4Kt-lTRKmPrhg', 'version': {'number': '8.12.0', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': '1665f706fd9354802c02146c1e6b5c0fbcddfbc9', 'build_date': '2024-01-11T10:05:27.953830042Z', 'build_snapshot': False, 'lucene_version': '9.9.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "from dotenv import load_dotenv\n",
    "from elasticsearch import Elasticsearch\n",
    "\n",
    "# Load ES_USER / ES_PASSWORD / ES_ENDPOINT from a local .env file, if one exists.\n",
    "load_dotenv()\n",
    "\n",
    "elastic_user = os.getenv('ES_USER')\n",
    "elastic_password = os.getenv('ES_PASSWORD')\n",
    "elastic_endpoint = os.getenv(\"ES_ENDPOINT\")\n",
    "\n",
    "# Fail fast with a readable error rather than trying to reach https://None:9200.\n",
    "_missing_settings = [\n",
    "    name\n",
    "    for name, value in (\n",
    "        (\"ES_USER\", elastic_user),\n",
    "        (\"ES_PASSWORD\", elastic_password),\n",
    "        (\"ES_ENDPOINT\", elastic_endpoint),\n",
    "    )\n",
    "    if not value\n",
    "]\n",
    "if _missing_settings:\n",
    "    raise RuntimeError(f\"Missing required environment variable(s): {', '.join(_missing_settings)}\")\n",
    "\n",
    "# Credentials are passed via basic_auth instead of being embedded in the URL:\n",
    "# a password containing '@', ':' or '/' would corrupt the URL, and URLs tend to\n",
    "# end up in logs and tracebacks.\n",
    "url = f\"https://{elastic_endpoint}:9200\"\n",
    "client = Elasticsearch(\n",
    "    url,\n",
    "    basic_auth=(elastic_user, elastic_password),\n",
    "    ca_certs=\"./http_ca.crt\",  # CA certificate used to verify the cluster's TLS certificate\n",
    "    verify_certs=True,\n",
    ")\n",
    "\n",
    "# Round-trip to the cluster so a bad endpoint or bad credentials surface here.\n",
    "print(client.info())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "92aa19bd-d524-4a80-aa81-ea0576a2a319",
   "metadata": {},
   "source": [
    "# Prepare the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "861b1d42-bf54-4697-a162-446235e78fa4",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.document_loaders import JSONLoader \n",
    "\n",
    "def metadata_func(record: dict, metadata: dict) -> dict:\n",
    "    \"\"\"Copy selected fields from a raw JSON record into the document metadata.\n",
    "\n",
    "    Each listed field is read with ``record.get`` (so an absent key is stored\n",
    "    as ``None``), written into ``metadata`` in place, and the same ``metadata``\n",
    "    dict is returned.\n",
    "    \"\"\"\n",
    "    for field in (\"name\", \"summary\", \"url\", \"category\", \"updated_at\"):\n",
    "        metadata[field] = record.get(field)\n",
    "    return metadata\n",
    "\n",
    "# For more loaders https://python.langchain.com/docs/modules/data_connection/document_loaders/\n",
    "# And 3rd party loaders https://python.langchain.com/docs/modules/data_connection/document_loaders/#third-party-loaders\n",
    "# This only configures the loader; loader.load() is called later, in the indexing cell.\n",
    "loader = JSONLoader(\n",
    "    file_path=\"workplace-docs.json\",  # relative path, resolved against the kernel's working directory\n",
    "    jq_schema=\".[]\",  # jq expression: iterate over every element of the top-level array\n",
    "    content_key=\"content\",  # record field used as the document text\n",
    "    metadata_func=metadata_func,  # copies name/summary/url/category/updated_at into the metadata\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "df39ad96-e997-42ff-a838-3f9f820c880b",
   "metadata": {},
   "source": [
    "# Load Model from Hugging Face"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "992a62d4-fca8-4a9b-9aa8-c86855f8caeb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2024-02-07 15:47:51,334 INFO : Establishing connection to Elasticsearch\n",
      "2024-02-07 15:47:51,356 INFO : Connected to cluster named 'elasticsearch' (version: 8.12.0)\n",
      "2024-02-07 15:47:51,356 INFO : Loading HuggingFace transformer tokenizer and model 'sentence-transformers/all-MiniLM-L6-v2'\n",
      "STAGE:2024-02-07 15:48:57 34543:2762445 ActivityProfilerController.cpp:312] Completed Stage: Warm Up\n",
      "STAGE:2024-02-07 15:48:57 34543:2762445 ActivityProfilerController.cpp:318] Completed Stage: Collection\n",
      "STAGE:2024-02-07 15:48:57 34543:2762445 ActivityProfilerController.cpp:322] Completed Stage: Post Processing\n",
      "2024-02-07 15:48:58,423 INFO : Creating model with id 'sentence-transformers__all-minilm-l6-v2'\n",
      "2024-02-07 15:48:58,488 INFO : Uploading model definition\n",
      "100%|███████████████████████████████████████| 87/87 [00:03<00:00, 24.11 parts/s]\n",
      "2024-02-07 15:49:02,159 INFO : Uploading model vocabulary\n",
      "2024-02-07 15:49:02,203 INFO : Starting model deployment\n",
      "2024-02-07 15:49:03,339 INFO : Model successfully imported with id 'sentence-transformers__all-minilm-l6-v2'\n"
     ]
    }
   ],
   "source": [
    "MODEL_ID = \"sentence-transformers__all-minilm-l6-v2\"\n",
    "\n",
    "# Never hard-code credentials in a notebook: the values below are interpolated by\n",
    "# IPython from the variables set in the connection cell (loaded from the environment).\n",
    "# NOTE(review): a plaintext password was previously committed on this line; rotate it.\n",
    "!eland_import_hub_model --url 'https://{elastic_endpoint}:9200' \\\n",
    "\t-u '{elastic_user}' \\\n",
    "\t-p '{elastic_password}' \\\n",
    "\t--hub-model-id sentence-transformers/all-MiniLM-L6-v2 \\\n",
    "\t--task-type text_embedding \\\n",
    "\t--ca-cert ./http_ca.crt \\\n",
    "\t--clear-previous \\\n",
    "\t--start"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e1781026-d1ec-417b-a2b7-f136ba3a0db0",
   "metadata": {},
   "source": [
    "# Setting up our Elasticsearch Index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "6205efcd-957a-45fe-893c-98e4f8a1ef01",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'nb_parent_retriever_index'})"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "PIPELINE_ID = \"chunk_text_to_passages\"\n",
    "MODEL_DIMS = 384  # must match the embedding size produced by MODEL_ID\n",
    "INDEX_NAME = \"nb_parent_retriever_index\"\n",
    "\n",
    "# Create the pipeline: for every entry in `passages`, run the embedding model on\n",
    "# that entry's `text` and write the inference result to the entry's `vector` field.\n",
    "client.ingest.put_pipeline(\n",
    "  id=PIPELINE_ID, \n",
    "  processors=[\n",
    "    {\n",
    "      \"foreach\": {\n",
    "        \"field\": \"passages\",\n",
    "        \"processor\": {\n",
    "          \"inference\": {\n",
    "            # The model reads its input from `text_field`; map each passage's text onto it.\n",
    "            \"field_map\": {\n",
    "              \"_ingest._value.text\": \"text_field\"\n",
    "            },\n",
    "            \"model_id\": MODEL_ID,\n",
    "            \"target_field\": \"_ingest._value.vector\",\n",
    "            # On failure, record the error on the document instead of rejecting it.\n",
    "            # The record names this pipeline (PIPELINE_ID) so errors can be traced back to it.\n",
    "            \"on_failure\": [\n",
    "              {\n",
    "                \"append\": {\n",
    "                  \"field\": \"_source._ingest.inference_errors\",\n",
    "                  \"value\": [\n",
    "                    {\n",
    "                      \"message\": \"Processor 'inference' in pipeline '\" + PIPELINE_ID + \"' failed with message '{{ _ingest.on_failure_message }}'\",\n",
    "                      \"pipeline\": PIPELINE_ID,\n",
    "                      \"timestamp\": \"{{{ _ingest.timestamp }}}\"\n",
    "                    }\n",
    "                  ]\n",
    "                }\n",
    "              }\n",
    "            ]\n",
    "          }\n",
    "        }\n",
    "      }\n",
    "    }\n",
    "  ]\n",
    ")\n",
    "\n",
    "# Create the index\n",
    "# Every document written to it goes through PIPELINE_ID (set as the default pipeline).\n",
    "# NOTE(review): indices.create fails if the index already exists, so delete the index\n",
    "# before re-running this cell.\n",
    "client.indices.create( \n",
    "  index=INDEX_NAME, \n",
    "  settings={\n",
    "    \"index\": {\n",
    "      \"default_pipeline\": PIPELINE_ID\n",
    "    }\n",
    "  },\n",
    "  mappings={\n",
    "    \"dynamic\": \"true\",\n",
    "    \"properties\": {\n",
    "      # `nested` keeps each passage as its own sub-document so it can be matched individually.\n",
    "      \"passages\": {\n",
    "        \"type\": \"nested\",\n",
    "        \"properties\": {\n",
    "          \"vector\": {\n",
    "            \"properties\": {\n",
    "              # Matches the pipeline's target field plus the `predicted_value` key queried later.\n",
    "              \"predicted_value\": {\n",
    "                \"type\": \"dense_vector\",\n",
    "                \"index\": True,\n",
    "                \"dims\": MODEL_DIMS,\n",
    "                # NOTE(review): dot_product expects unit-length vectors; assumes the model\n",
    "                # emits normalized embeddings. Confirm before swapping models.\n",
    "                \"similarity\": \"dot_product\"\n",
    "              }\n",
    "            }\n",
    "          }\n",
    "        }\n",
    "      }\n",
    "    }\n",
    "  }\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a9bd50b6-3e7d-40bb-880d-6650ff6de5d1",
   "metadata": {},
   "source": [
    "# Utils: Parent Child Splitter Function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "dcaeb562-33f2-4cbf-a064-7207533ea6c6",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "\n",
    "def parent_child_splitter(documents, chunk_size: int = 200):\n",
    "  \"\"\"Turn each document into one parent record that carries its child passages.\n",
    "\n",
    "  Every document is split with a RecursiveCharacterTextSplitter configured with\n",
    "  `chunk_size`. The returned list holds one dict per input document with the keys\n",
    "  `content` (the document's page_content), `metadata` (the document's metadata)\n",
    "  and `passages` (a list of `{\"text\": ...}` dicts, one per chunk).\n",
    "  \"\"\"\n",
    "  splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)\n",
    "\n",
    "  def to_parent_record(parent):\n",
    "    # Split first, then read the parent's own fields.\n",
    "    children = splitter.split_documents([parent])\n",
    "    return {\n",
    "        \"content\": parent.page_content,\n",
    "        \"metadata\": parent.metadata,\n",
    "        \"passages\": [{\"text\": child.page_content} for child in children],\n",
    "    }\n",
    "\n",
    "  return [to_parent_record(parent) for parent in documents]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "63018645-d0b0-4089-b929-c994b3f2ed76",
   "metadata": {},
   "source": [
    "# Utils: Pretty Response"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "98f2c79a-5f89-4f78-970f-b2c5e89866ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "def pretty_response(response, show_parent_text=False):\n",
    "  \"\"\"Print each search hit together with the text of its matched passages.\n",
    "\n",
    "  For every hit this prints the document id, the `name` from its metadata, the\n",
    "  parent `content` (left empty unless `show_parent_text` is true), the text of\n",
    "  each inner-hit passage followed by a blank line, and the score, then a `---`\n",
    "  separator. A single notice is printed when the response contains no hits.\n",
    "  \"\"\"\n",
    "  hits = response['hits']['hits']\n",
    "  if len(hits) == 0:\n",
    "    print('Your search returned no results.')\n",
    "    return\n",
    "\n",
    "  for hit in hits:\n",
    "    doc_id = hit['_id']\n",
    "    score = hit['_score']\n",
    "    doc_title = hit['_source'][\"metadata\"]['name']\n",
    "    parent_text = hit['_source'][\"content\"] if show_parent_text else \"\"\n",
    "\n",
    "    # Only the first `passages` entry and its first `text` value are read per inner hit.\n",
    "    passage_text = \"\".join(\n",
    "      inner_hit[\"fields\"][\"passages\"][0]['text'][0] + \"\\n\\n\"\n",
    "      for inner_hit in hit['inner_hits']['passages']['hits']['hits']\n",
    "    )\n",
    "\n",
    "    print(\n",
    "      f\"\\nID: {doc_id}\\n\"\n",
    "      f\"Doc Title: {doc_title}\\n\"\n",
    "      f\"parent text:\\n{parent_text}\\n\"\n",
    "      f\"Passage Text:\\n{passage_text}\\n\"\n",
    "      f\"Score: {score}\\n\"\n",
    "    )\n",
    "    print(\"---\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a0064c31-178e-431e-9ad1-63d458216c0b",
   "metadata": {},
   "source": [
    "# Full Document, nested passages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "517209e5-95a6-4d80-a8b4-2d8255ae58a9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Indexed 15 documents with [] errors\n"
     ]
    }
   ],
   "source": [
    "import time  # unused after switching to an explicit refresh; kept so the kernel namespace is unchanged\n",
    "\n",
    "from elasticsearch import helpers\n",
    "\n",
    "# One parent record per source document, each carrying its ~600-character passages.\n",
    "chunked_docs = parent_child_splitter(loader.load(), chunk_size=600)\n",
    "\n",
    "# The index's default pipeline embeds every passage while the documents are ingested.\n",
    "count, errors = helpers.bulk(\n",
    "  client, \n",
    "  chunked_docs,\n",
    "  index=INDEX_NAME\n",
    ")\n",
    "\n",
    "print(f\"Indexed {count} documents with {errors} errors\")\n",
    "\n",
    "# Make the new documents searchable right away. An explicit refresh is deterministic,\n",
    "# unlike sleeping for a fixed time and hoping the periodic refresh has already run.\n",
    "client.indices.refresh(index=INDEX_NAME)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8a42b61c-642a-45ad-b1a8-943e9a4712c6",
   "metadata": {},
   "source": [
    "# Perform a Nested Search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "3af9c1e0-25a2-4c28-a453-081edebba7bc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "ID: pLOcgo0B8HkRDilJp-Mp\n",
      "Doc Title: Work From Home Policy\n",
      "parent text:\n",
      "\n",
      "Passage Text:\n",
      "Effective: March 2020\n",
      "Purpose\n",
      "\n",
      "The purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.\n",
      "Scope\n",
      "\n",
      "This policy applies to all employees who are eligible for remote work as determined by their role and responsibilities. It is designed to allow employees to work from home full time while maintaining the same level of performance and collaboration as they would in the office.\n",
      "Eligibility\n",
      "\n",
      "\n",
      "Score: 0.8483097\n",
      "\n",
      "---\n",
      "\n",
      "ID: q7Ocgo0B8HkRDilJp-Mp\n",
      "Doc Title: Intellectual Property Policy\n",
      "parent text:\n",
      "\n",
      "Passage Text:\n",
      "Purpose\n",
      "The purpose of this Intellectual Property Policy is to establish guidelines and procedures for the ownership, protection, and utilization of intellectual property generated by employees during their employment. This policy aims to encourage creativity and innovation while ensuring that the interests of both the company and its employees are protected.\n",
      "\n",
      "Scope\n",
      "This policy applies to all employees, including full-time, part-time, temporary, and contract employees.\n",
      "\n",
      "\n",
      "Score: 0.7292882\n",
      "\n",
      "---\n",
      "\n",
      "ID: qLOcgo0B8HkRDilJp-Mp\n",
      "Doc Title: Company Vacation Policy\n",
      "parent text:\n",
      "\n",
      "Passage Text:\n",
      "Purpose\n",
      "\n",
      "The purpose of this vacation policy is to outline the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes. This policy aims to promote a healthy work-life balance and encourage employees to take time to rest and recharge.\n",
      "Scope\n",
      "\n",
      "This policy applies to all full-time and part-time employees who have completed their probationary period.\n",
      "Vacation Accrual\n",
      "\n",
      "\n",
      "Score: 0.7137784\n",
      "\n",
      "---\n",
      "\n",
      "ID: prOcgo0B8HkRDilJp-Mp\n",
      "Doc Title: Wfh Policy Update May 2023\n",
      "parent text:\n",
      "\n",
      "Passage Text:\n",
      "As we continue to prioritize the well-being of our employees, we are making a slight adjustment to our hybrid work policy. Starting May 1, 2023, employees will be required to work from the office three days a week, with two days designated for remote work. Please communicate with your supervisor and HR department to establish your updated in-office workdays.\n",
      "\n",
      "\n",
      "Score: 0.70840275\n",
      "\n",
      "---\n",
      "\n",
      "ID: srOcgo0B8HkRDilJp-Mp\n",
      "Doc Title: New Employee Onboarding Guide\n",
      "parent text:\n",
      "\n",
      "Passage Text:\n",
      "Review benefits options: Carefully review the benefits package and choose the options that best meet your needs.\n",
      "Complete enrollment forms: Fill out the necessary forms to enroll in your chosen benefits. Submit these forms to the HR department within 30 days of your start date.\n",
      "Designate beneficiaries: If applicable, designate beneficiaries for your life insurance and retirement plans.\n",
      "Getting Settled in Your Workspace\n",
      "To help you feel comfortable and productive in your new workspace, take the following steps:\n",
      "\n",
      "\n",
      "Score: 0.6890812\n",
      "\n",
      "---\n"
     ]
    }
   ],
   "source": [
    "# kNN search over the nested passage vectors. Each hit is a parent document, and\n",
    "# inner_hits returns the single best-matching passage (text only) for that parent.\n",
    "response = client.search(\n",
    "  index=INDEX_NAME, \n",
    "  knn={\n",
    "    \"inner_hits\": {\n",
    "      \"size\": 1,\n",
    "      \"_source\": False,\n",
    "      \"fields\": [\n",
    "        \"passages.text\"\n",
    "      ]\n",
    "    },\n",
    "    \"field\": \"passages.vector.predicted_value\",\n",
    "    \"k\": 5,\n",
    "    \"num_candidates\": 100,\n",
    "    # The query text is embedded server-side. MODEL_ID is reused so the query always\n",
    "    # goes through the same model that embedded the passages at ingest time.\n",
    "    \"query_vector_builder\": {\n",
    "      \"text_embedding\": {\n",
    "        \"model_id\": MODEL_ID,\n",
    "        \"model_text\": \"Whats the work from home policy?\"\n",
    "      }\n",
    "    }\n",
    "  }\n",
    ")\n",
    "\n",
    "pretty_response(response)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e741acbd-6922-4601-943a-cbe0a0aa0abd",
   "metadata": {},
   "source": [
    "# With Langchain"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "4ba5b12f-8cea-4073-97aa-3c103ee04bac",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Doc title: Work From Home Policy\n",
      "Text:\n",
      "Effective: March 2020\n",
      "Purpose\n",
      "\n",
      "The purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.\n",
      "Scope\n",
      "\n",
      "This policy applies to all employees who are eligible for remote work as determined by their role and responsibilities. It is designed to allow employees to work from home full time while maintaining the same level of performance and collaboration as they would in the office.\n",
      "Eligibility\n",
      "\n",
      "\n",
      "Doc title: Intellectual Property Policy\n",
      "Text:\n",
      "Purpose\n",
      "The purpose of this Intellectual Property Policy is to establish guidelines and procedures for the ownership, protection, and utilization of intellectual property generated by employees during their employment. This policy aims to encourage creativity and innovation while ensuring that the interests of both the company and its employees are protected.\n",
      "\n",
      "Scope\n",
      "This policy applies to all employees, including full-time, part-time, temporary, and contract employees.\n",
      "\n",
      "\n",
      "Doc title: Company Vacation Policy\n",
      "Text:\n",
      "Purpose\n",
      "\n",
      "The purpose of this vacation policy is to outline the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes. This policy aims to promote a healthy work-life balance and encourage employees to take time to rest and recharge.\n",
      "Scope\n",
      "\n",
      "This policy applies to all full-time and part-time employees who have completed their probationary period.\n",
      "Vacation Accrual\n",
      "\n",
      "\n",
      "Doc title: Wfh Policy Update May 2023\n",
      "Text:\n",
      "As we continue to prioritize the well-being of our employees, we are making a slight adjustment to our hybrid work policy. Starting May 1, 2023, employees will be required to work from the office three days a week, with two days designated for remote work. Please communicate with your supervisor and HR department to establish your updated in-office workdays.\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from langchain.vectorstores.elasticsearch import ElasticsearchStore, ApproxRetrievalStrategy\n",
    "from typing import List, Union\n",
    "from langchain_core.documents import Document\n",
    "\n",
    "class CustomRetrievalStrategy(ApproxRetrievalStrategy):\n",
    "    \"\"\"Retrieval strategy that runs a kNN search over the nested passage vectors.\"\"\"\n",
    "\n",
    "    def query(\n",
    "        self,\n",
    "        query: Union[str, None],\n",
    "        filter: List[dict],\n",
    "        **kwargs,\n",
    "    ):\n",
    "        \"\"\"Build the Elasticsearch search body for a text query.\n",
    "\n",
    "        Args:\n",
    "            query: Text to embed; sent as `model_text` to the query vector builder.\n",
    "            filter: Filter clauses, forwarded unchanged as the kNN `filter`.\n",
    "            **kwargs: Accepted and ignored.\n",
    "\n",
    "        Returns:\n",
    "            A dict with a single `knn` section targeting\n",
    "            `passages.vector.predicted_value`, with inner hits that return only\n",
    "            `passages.text`.\n",
    "        \"\"\"\n",
    "        # NOTE(review): k and num_candidates are fixed here; any values the vector\n",
    "        # store passes through kwargs are not used. Confirm this is intended.\n",
    "        es_query = {\n",
    "            \"knn\": {\n",
    "                \"inner_hits\": {\n",
    "                    \"_source\": False,\n",
    "                    \"fields\": [\n",
    "                        \"passages.text\"\n",
    "                    ]\n",
    "                },\n",
    "                \"field\": \"passages.vector.predicted_value\",\n",
    "                \"filter\": filter,\n",
    "                \"k\": 5,\n",
    "                \"num_candidates\": 100,\n",
    "                \"query_vector_builder\": {\n",
    "                    \"text_embedding\": {\n",
    "                        # Reuse MODEL_ID so the query uses the model that embedded the passages.\n",
    "                        \"model_id\": MODEL_ID,\n",
    "                        \"model_text\": query\n",
    "                    }\n",
    "                }\n",
    "            }\n",
    "        }\n",
    "\n",
    "        return es_query\n",
    "    \n",
    "\n",
    "# Wrap the existing index in a LangChain vector store. It reuses the `client`\n",
    "# connection and INDEX_NAME from earlier cells, and the custom strategy defined\n",
    "# in this cell supplies the nested kNN request body.\n",
    "vector_store = ElasticsearchStore(\n",
    "    index_name=INDEX_NAME,\n",
    "    es_connection=client,\n",
    "    query_field=\"content\",\n",
    "    strategy=CustomRetrievalStrategy(),\n",
    ")\n",
    "\n",
    "def doc_builder(hit):\n",
    "  \"\"\"Build a Document from a search hit using the text of all its matched passages.\n",
    "\n",
    "  The page content is the text of every inner-hit passage, each followed by a\n",
    "  blank line. The metadata is the parent document's `metadata`. A Document is\n",
    "  always returned, with empty page content when the hit has no inner hits.\n",
    "  \"\"\"\n",
    "  passage_hits = hit.get(\"inner_hits\", {}).get(\"passages\", {}).get(\"hits\", {}).get(\"hits\", [])\n",
    "  page_content = \"\"\n",
    "  for passage_hit in passage_hits:\n",
    "    passages = passage_hit.get(\"fields\", {}).get(\"passages\", [])\n",
    "    if not passages:\n",
    "      # No fields on this inner hit: skip it instead of raising IndexError.\n",
    "      continue\n",
    "    texts = passages[0].get(\"text\", [])\n",
    "    if texts:\n",
    "      page_content += texts[0] + \"\\n\\n\"\n",
    "\n",
    "  # Return only after every passage is collected. Returning inside the loop kept\n",
    "  # just the first passage and yielded None for hits without inner hits.\n",
    "  return Document(\n",
    "    page_content=page_content,\n",
    "    metadata=hit[\"_source\"][\"metadata\"],\n",
    "  )\n",
    "\n",
    "results = vector_store.similarity_search(\n",
    "    query=\"Whats the work from home policy?\",\n",
    "    doc_builder=doc_builder,\n",
    ")\n",
    "\n",
    "# One print per document: the title line, then the collected passage text.\n",
    "for result in results:\n",
    "    print(f'Doc title: {result.metadata[\"name\"]}\\nText:\\n{result.page_content}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "73758cf9-d78e-4a70-82c9-84b444753fc5",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python_3.11",
   "language": "python",
   "name": "python_3.11"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
